In [16]:
import os
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.offline as py
py.init_notebook_mode(connected=False)
In [17]:
CONFIRMED_PATH = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
DEATH_PATH = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
RECOVERED_PATH = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'
In [18]:
def get_data_df(data_path):
    
    data_df = pd.read_csv(data_path)
    data_df = data_df.rename(columns={'Province/State': 'province_or_state', 'Country/Region': 'country'})
    data_df['province_or_state'] = data_df['province_or_state'].fillna('')
    data_df = data_df.drop(data_df[data_df['province_or_state'].str.contains('Diamond Princess')].index)
    data_df = data_df.drop(data_df[data_df['country'].str.contains('Diamond Princess')].index)
    agg_spec = {k : 'sum' for k in data_df.columns[4:]}
    data_df = data_df.groupby(['country']).agg(agg_spec).reset_index()
    
    return data_df
In [19]:
def get_weather_df():
    
    weather_df = pd.read_csv('yearly_temp.csv', sep='\t')
    weather_df = weather_df.rename(columns=lambda x: x.strip())
    weather_df['country'] = weather_df['country'].str.strip()
    weather_df['temp'] = pd.to_numeric(weather_df['temp'])
    
    return weather_df
In [20]:
def get_population_df():

    population_df = pd.read_csv('population_data.csv', sep='\t')
    population_df['Med. Age'] = population_df['Med. Age'].str.replace(',', '').str.replace('N.A', '0').astype(float)
    population_df['Urban Pop %'] = population_df['Urban Pop %'].str.replace(',', '').str.replace('N.A', '0').astype(float)
    
    return population_df
In [21]:
def join_data_df_weather(df):
    
    weather_df = get_weather_df()
    drop_countries = set(df['country'].unique()) - set(weather_df['country'].unique())
    df = df.join(weather_df.set_index('country'), on='country', how='left')
    df['hot'] = np.where(df['temp'] > 15.0, True, False)
    df = df.drop(df[df['country'].isin(drop_countries)].index)   

    return df
In [22]:
def join_data_df_population(df):

    population_df = get_population_df()
    drop_countries = set(df['country'].unique()) - set(population_df['country'].unique())
    df = df.join(population_df.set_index('country'), on='country', how='left')
    df = df.drop(df[df['country'].isin(drop_countries)].index)   

    return df
In [23]:
COUNTRIES_OF_INTEREST = ['Italy', 'France', 'China', 'United Kingdom', 'US', 'Germany', 'Spain', 
                         'Japan', 'Israel', 'Netherlands', 'Korea, South']
In [24]:
data_df = get_data_df(CONFIRMED_PATH)
In [25]:
data_df_columns = data_df.columns
In [26]:
weather_population_columns = list(get_weather_df().columns[1:]) + ['hot'] + list(get_population_df().columns[1:])
In [27]:
data_df_t = data_df.melt(id_vars=['country'], var_name='date', value_name='confirmed').fillna('<all>')
data_of_interest = data_df_t[data_df_t['country'].isin(COUNTRIES_OF_INTEREST)]
In [28]:
fig = px.line(data_of_interest, x="date", y="confirmed", color='country', log_y=True)
fig.show()
In [29]:
diff_df = data_df.copy()
for i in range(len(data_df.columns)-1, 1, -1):
    diff_df.iloc[:, i] = (data_df.iloc[:, i] - data_df.iloc[:, i-1]) / data_df.iloc[:, i-1]

diff_df = diff_df.fillna(0.0)
In [30]:
diff_df_t = diff_df.melt(id_vars=['country'], var_name='date', value_name='new').fillna('<all>')
diff_of_interest = diff_df_t[diff_df_t['country'].isin(COUNTRIES_OF_INTEREST)]
c = diff_of_interest.groupby('country').cumcount()
week_of_interest = diff_of_interest.groupby(['country', c // 7]).agg({'date': 'first', 'new': 'sum'}).reset_index()
In [31]:
fig = px.line(week_of_interest, x="date", y="new", color='country', log_y=True)
fig.show()
In [32]:
data_df = join_data_df_weather(data_df)
In [33]:
data_df = join_data_df_population(data_df)
In [34]:
data_norm_df = data_df.copy()
for i in range(len(data_df.columns)-len(weather_population_columns), 0, -1):
    data_norm_df.iloc[:, i] = pd.to_numeric(data_df.iloc[:, i]) / pd.to_numeric(data_df['Population (2020)'])
In [35]:
data_norm_df_t = data_norm_df.drop(columns=weather_population_columns)
data_norm_df_t = data_norm_df_t.melt(id_vars=['country'], var_name='date', value_name='confirmed').fillna('<all>')
norm_of_interest = data_norm_df_t[data_norm_df_t['country'].isin(COUNTRIES_OF_INTEREST)]
In [36]:
fig = px.line(norm_of_interest, x="date", y="confirmed", color='country', log_y=True)
fig.show()
In [37]:
death_df = get_data_df(DEATH_PATH)
death_df = join_data_df_weather(death_df)
death_df = join_data_df_population(death_df)
In [38]:
death_norm_df = death_df.copy()
for i in range(len(death_df.columns)-len(weather_population_columns), 0, -1):
    death_norm_df.iloc[:, i] = pd.to_numeric(death_df.iloc[:, i]) / pd.to_numeric(death_df['Population (2020)'])
In [39]:
death_norm_df_t = death_norm_df.drop(columns=weather_population_columns)
death_norm_df_t = death_norm_df_t.melt(id_vars=['country'], var_name='date', value_name='deaths').fillna('<all>')
death_norm_of_interest = death_norm_df_t[death_norm_df_t['country'].isin(COUNTRIES_OF_INTEREST)]
In [40]:
fig = px.line(death_norm_of_interest, x="date", y="deaths", color='country', log_y=True)
fig.show()
In [41]:
recovered_df = get_data_df(RECOVERED_PATH)
recovered_df = join_data_df_weather(recovered_df)
recovered_df = join_data_df_population(recovered_df)
In [42]:
active_df = data_df.copy()
for i in range(len(data_df.columns)-len(weather_population_columns), 0, -1):
    active_df.iloc[:, i] = pd.to_numeric(active_df.iloc[:, i]) - pd.to_numeric(death_df.iloc[:, i]) - pd.to_numeric(recovered_df.iloc[:, i])
In [43]:
active_df_norm_t = active_df.drop(columns=weather_population_columns)
active_df_norm_t = active_df_norm_t.melt(id_vars=['country'], var_name='date', value_name='active').fillna('<all>')
active_norm_of_interest = active_df_norm_t[active_df_norm_t['country'].isin(COUNTRIES_OF_INTEREST)]
In [44]:
fig = px.line(active_norm_of_interest, x="date", y="active", color='country', log_y=True)
fig.show()
In [45]:
active_df.loc["world", data_df_columns[1:]] = active_df[data_df_columns[1:]].sum()
active_df.at['world', 'country'] = 'world'
active_df_norm_t = active_df.drop(columns=weather_population_columns)
active_df_norm_t = active_df_norm_t.melt(id_vars=['country'], var_name='date', value_name='active').fillna('<all>')
active_norm_of_interest = active_df_norm_t[active_df_norm_t['country'].isin(['world'])]
In [46]:
fig = px.line(active_norm_of_interest, x="date", y="active", color='country', log_y=True)
fig.show()
In [47]:
# hot_cold_df = data_df[list(data_df_columns) + ['temp', 'hot']].groupby('hot').agg({k : 'sum' for k in data_df.columns[1:len(data_df_columns)]}).reset_index()
In [48]:
hot_cold_df_t = data_df[list(data_df_columns)[1:] + ['hot']].melt(id_vars=['hot'], var_name='date', value_name='confirmed').fillna('<all>')
In [49]:
fig = px.line(hot_cold_df_t, x="date", y="confirmed", color='hot')
fig.show()
In [50]:
static_columns = ['country', data_df_columns[-1]] + weather_population_columns
In [51]:
data_df_scatter = data_df[static_columns]
data_df_scatter = data_df_scatter.rename(columns={data_df_columns[-1]: 'confirmed'})
data_df_scatter.drop(data_df_scatter[pd.to_numeric(data_df_scatter['Population (2020)'] < 100000)].index).sort_values(by=['confirmed'], ascending=False).head(n=10)
Out[51]:
country confirmed temp hot Population (2020) Density (P/Km²) Land Area (Km²) Migrants (net) Med. Age Urban Pop %
170 US 607670 8.55 False 331002651 36 9147420 954806.0 38.0 83.0
155 Spain 172541 13.30 False 46754778 94 498800 40000.0 45.0 80.0
83 Italy 162488 13.45 False 60461826 206 294140 148943.0 47.0 69.0
60 France 131361 10.70 False 65273511 119 547557 36527.0 42.0 82.0
64 Germany 131359 8.50 False 83783942 240 348560 543822.0 46.0 76.0
174 United Kingdom 94845 8.45 False 67886011 281 241930 260650.0 40.0 83.0
36 China 83306 6.95 False 1439323776 153 9388211 -348399.0 38.0 61.0
79 Iran 74877 17.25 True 83992949 52 1628550 -55000.0 32.0 76.0
169 Turkey 65111 11.10 False 84339067 110 769630 283922.0 32.0 76.0
16 Belgium 31119 9.55 False 11589623 383 30280 48000.0 42.0 98.0
In [52]:
data_norm_df = data_norm_df[static_columns]
data_norm_df = data_norm_df.rename(columns={data_df_columns[-1]: 'confirmed'})
data_norm_df.drop(data_norm_df[pd.to_numeric(data_norm_df['Population (2020)'] < 100000)].index).sort_values(by=['confirmed'], ascending=False).head(n=10)
Out[52]:
country confirmed temp hot Population (2020) Density (P/Km²) Land Area (Km²) Migrants (net) Med. Age Urban Pop %
100 Luxembourg 0.005283 1.381838e-05 False 625978 242 2590 9741.0 40.0 88.0
76 Iceland 0.005040 5.128310e-06 False 341243 3 100250 380.0 37.0 94.0
155 Spain 0.003690 2.844629e-07 False 46754778 94 498800 40000.0 45.0 80.0
160 Switzerland 0.002997 6.354986e-07 False 8654622 219 39516 52000.0 43.0 74.0
83 Italy 0.002687 2.224544e-07 False 60461826 206 294140 148943.0 47.0 69.0
16 Belgium 0.002685 8.240130e-07 False 11589623 383 30280 48000.0 42.0 98.0
81 Ireland 0.002325 1.883435e-06 False 4937786 72 68890 23604.0 38.0 63.0
60 France 0.002012 1.639256e-07 False 65273511 119 547557 36527.0 42.0 82.0
170 US 0.001836 2.583061e-08 False 331002651 36 9147420 954806.0 38.0 83.0
134 Portugal 0.001711 1.485773e-06 True 10196709 111 91590 -6000.0 46.0 66.0
In [53]:
fig = px.scatter(data_norm_df, x="Urban Pop %", y="confirmed", hover_data=['country'], log_x=True, log_y=True)
fig.show()
In [54]:
fig = px.scatter(data_norm_df, x="Density (P/Km²)", y="confirmed", hover_data=['country'], log_x=True, log_y=True)
fig.show()
In [55]:
fig = px.scatter(data_norm_df, x="Med. Age", y="confirmed", hover_data=['country'], log_x=True, log_y=True)
fig.show()
In [56]:
death_df_static = death_norm_df[static_columns].copy()
death_df_static = death_df_static.rename(columns={data_df_columns[-1]: 'deaths'})
In [57]:
fig = px.scatter(death_df_static, x="Med. Age", y="deaths", hover_data=['country'], log_x=True, log_y=True)
fig.show()
In [58]:
fig = px.scatter(death_df_static, x="Urban Pop %", y="deaths", hover_data=['country'], log_x=True, log_y=True,
                size_max=60, color='country', size='Population (2020)')
fig.show()
In [59]:
fig = px.scatter(death_df_static, x="Density (P/Km²)", y="deaths", hover_data=['country'], log_x=True, log_y=True)
fig.show()